#===============================================================================
#  File:    0A3-AppendixB4-Other-Dictionaries.R
#  Date:    June, 2022 
#  Author:  Natalia Umansky
#  Purpose: Replicating figures displayed in the Appendix B.4
#===============================================================================

# LIBRARIES
#===============================================================================
# Attach every package with library(): it stops with an error as soon as a
# package is missing, whereas require() only returns FALSE and lets the script
# fail later with a less helpful message. Load order is unchanged so that
# function masking between packages stays the same.
library(quanteda)
library(LSX)
library(data.table)
library(rtweet)
library(stringr)
library(stringi)
library(qdapRegex)
library(tidyverse)
library(kableExtra)
library(ggplot2)
library(cowplot)

# Allow quanteda to use up to 8 threads for its parallelised operations
quanteda_options(threads = 8)


# IMPORT DATA
#===============================================================================

# Account groups; each one has a "<group>_translated.csv" file in the home
# directory
Groups <- c("Politicians", "Media", "Citizens", "Advocates", "Friends")

# Only these columns are read from each file
keep_cols <- c("user_id", "status_id", "created_at", "screen_name", "text", "hashtags")

# Read each group's file and bind it to a variable named after the group
for (group in Groups) {
  csv_path <- paste0("~/", group, "_translated.csv")
  assign(group, fread(csv_path, select = keep_cols))
}

# Stack all groups into one table. The order (Media first) differs from
# `Groups` and is kept as is: row position decides which chunk a tweet falls
# into when the corpus is split further down.
ALL <- rbind(Media, Politicians, Citizens, Advocates, Friends)


# FUNCTIONS
#===============================================================================

# Cleaning text

# Build a trimmed document-feature matrix (dfm) from a set of tweets.
#
# Args:
#   x:          Anything accepted by quanteda::corpus(); the script passes
#               data.table chunks that contain a `text` column.
#   stop_terms: Character vector of features to drop. Defaults to the
#               script-level `stops` vector (looked up when the function is
#               called, so `stops` may be defined after this function).
#
# Returns:
#   A quanteda dfm restricted to purely alphanumeric, lower-cased tokens (plus
#   the compounds listed below), without stop terms, keeping only features that
#   occur at least 10 times and in at least 0.1% of the documents of `x`, and
#   that are at least 2 characters long.
#
# All preprocessing is done on the tokens object. The original passed these
# options (`remove`, `stem`, `remove_*`, `include_docvars`) to dfm(), which is
# deprecated since quanteda 3; doing it at the tokens stage yields the same
# features and does not depend on the installed quanteda version.
cleaning <- function(x, stop_terms = stops) {
  # Word sequences joined into a single feature (e.g. "no_safe") so that
  # negations and multi-word names are scaled as one term
  compounds <- list(
    c("no", "safe"), c("not", "safe"),
    c("no", "normal"), c("not", "normal"),
    c("no", "peace"), c("not", "peace"),
    c("new", "zealand"), c("united", "states"),
    c("united", "kingdom"), c("north", "korea"),
    c("sri", "lanka"), c("el", "paso"),
    c("no", "danger"), c("no", "harm"),
    c("not", "an", "emergency"),
    c("strong", "ties"),
    c("existential", "threat"),
    c("existential", "threats"),
    c("not", "a", "threat"),
    c("existential", "crisis"),
    # "concearned" is the original (misspelt) entry and can never match
    # correctly spelt text; the correct spelling is added next to it
    c("deeply", "concearned"),
    c("deeply", "concerned"),
    c("take", "action"),
    c("under", "attack"),
    c("out", "of", "danger")
  )

  toks <- corpus(x) %>%
    tokens(remove_punct = TRUE) %>%
    # keep purely alphanumeric tokens only (drops symbols, hyphenated forms)
    tokens_select("^[0-9a-zA-Z]+$", valuetype = "regex") %>%
    tokens_tolower() %>%
    tokens_compound(pattern = compounds) %>%
    # numbers are removed only after compounding, as before, so a number
    # standing between two words still prevents them from being joined
    tokens(remove_numbers = TRUE) %>%
    tokens_remove(pattern = stop_terms)

  toks %>%
    dfm(tolower = TRUE, verbose = TRUE) %>%
    dfm_trim(min_termfreq = 10, termfreq_type = "count",
             min_docfreq = .001, docfreq_type = "prop", verbose = TRUE) %>%
    dfm_select(min_nchar = 2, selection = "keep")
}


# Write the first `n` and last `n` model coefficients side by side to a file.
#
# Args:
#   x:         Fitted model with a coef() method (the script passes the
#              result of textmodel_lss()).
#   n:         Number of coefficients taken from each end of coef(x).
#   file_name: Path of the comma-separated file to write.
#
# Returns:
#   A data.table with four columns: term names (`rn`) and values of the first
#   `n` coefficients, followed by term names (`rn`) and values of the last `n`.
#   The value columns are named after the expressions that produced them.
#   NOTE(review): presumably coef(x) is sorted by polarity, making these the
#   two extremes of the scale; confirm against the LSX documentation.
scaled_table <- function(x, n, file_name){

  # First n coefficients; setDT() converts in place and moves the row names
  # (the terms) into a leading `rn` column
  top_terms <- as.data.frame(head(coef(x), n))
  setDT(top_terms, keep.rownames = TRUE)

  # Last n coefficients, same layout
  bottom_terms <- as.data.frame(tail(coef(x), n))
  setDT(bottom_terms, keep.rownames = TRUE)

  both_ends <- cbind(top_terms, bottom_terms)

  # Row numbers are written as the first field of every data row
  write.table(both_ends, file = file_name, sep = ",",
              row.names = TRUE, col.names = TRUE,
              fileEncoding = "UTF-8")

  both_ends
}


# FITTING THE LSS MODEL
#===============================================================================
# Stop terms removed from the tokens in cleaning(). Element order is unchanged
# from the original list; a few entries ("its", "were", "the", "to", "will")
# appear twice, which is harmless for removal.
stops <- c(
  # personal pronouns, possessives and reflexives
  "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
  "you", "your", "yours", "yourself", "yourselves",
  "he", "him", "his", "himself", "she", "her", "hers", "herself",
  "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
  # interrogatives and demonstratives
  "what", "which", "who", "whom", "this", "that", "these", "those",
  # auxiliary and modal verbs
  "am", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "having", "do", "does", "did", "doing",
  "would", "should", "could", "ought",
  # contractions written with an apostrophe
  "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
  "i've", "you've", "we've", "they've",
  "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
  "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
  "let's", "that's", "who's", "what's", "here's", "there's",
  "when's", "where's", "why's", "how's",
  # articles, conjunctions and prepositions
  "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
  "while", "of", "at", "by", "for", "with", "about", "against",
  "between", "into", "through", "during", "before", "after", "above",
  "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
  "under",
  # adverbs, quantifiers and similar function words
  "again", "further", "then", "once", "here", "there", "when", "where",
  "why", "how", "all", "any", "both", "each", "few", "more", "most",
  "other", "some", "such", "only", "own", "same", "so", "than", "too",
  "very", "will",
  # the same contractions without the apostrophe, for text whose punctuation
  # has already been stripped
  "im", "youre", "hes", "shes", "its", "were", "theyre", "ive", "youve",
  "weve", "theyve", "id", "youd", "hed", "shed", "wed", "theyd", "youll",
  "theyll", "lets", "thats", "whos", "whats", "heres", "theres", "whens",
  "wheres", "whys", "hows",
  # extra terms; "rt" is presumably the retweet marker and "hell" the
  # apostrophe-free "he'll"
  "the", "to", "can", "will", "rt", "ones", "hell"
)


# Normalise the raw tweet text before tokenisation. Order matters: mentions,
# URLs and HTML entities are removed while "@", "/" and "&" are still present,
# i.e. before the punctuation strip.
ALL$text <- ALL$text %>%
  # Drop @mentions. This must be a regex replacement: the previous
  # stri_replace_all_fixed() call searched for the literal text "@\w+*", so
  # no mention was ever removed and the handles ended up as ordinary words.
  stri_replace_all_regex(pattern = "@\\w+", replacement = "") %>%
  rm_twitter_url() %>%
  # Drop the "amp" left by the HTML entity "&amp;" (also when written
  # "& amp;"). Word boundaries keep words such as "campaign" or "example"
  # intact, which a bare "amp" pattern would mangle.
  str_replace_all("\\bamp\\b", "") %>%
  # Strips all punctuation, including "@" and "#", so no separate removal of
  # those two characters is needed afterwards
  str_remove_all("[[:punct:]]") %>%
  str_replace_all("[^\x01-\x7F]", "") %>%  # removes emojis and any other non-ASCII character
  str_replace_all("[[:digit:]]+", "")



# Split the corpus into consecutive chunks of at most `chunk` tweets, build one
# dfm per chunk with cleaning(), and stack the chunk dfms row-wise.
# Note that the frequency trimming inside cleaning() is therefore applied
# within each chunk, not on the full corpus.
chunk <- 100000
n <- nrow(ALL)

# Chunk id per row: rows 1..chunk -> 1, rows chunk+1..2*chunk -> 2, ...
# (seq_len() keeps this well defined for an empty table)
r <- as.integer((seq_len(n) - 1L) %/% chunk + 1)
d <- split(ALL, r)
names(d) <- paste0("All_", seq_along(d))

# One dfm per chunk; unname() so that rbind() receives positional arguments
# only, exactly as with the previous unnamed list
dfmt <- unname(lapply(d, cleaning))

tweets.dfm <- do.call("rbind", dfmt)

# Watanabe (2019)
#===============================================================================

# Seed words for the Watanabe (2019) dictionary. With the as.seedwords()
# defaults the first vector is the upper (positive) pole and the second the
# lower (negative) pole of the scale.
seeds <- list(
  c("crisis", "dangerous", "harm", "fear", "tense",
    "hostile", "escalate", "extreme", "warn", "war"),
  c("normal", "safe", "benefit", "confident", "relax", "friendly",
    "stabilise", "stabilize", "moderate", "assure", "peace")
)

# Fit the LSS model on the full dfm
tmod_lss <- tweets.dfm %>%
  textmodel_lss(seeds = as.seedwords(seeds), cache = TRUE, auto_weight = FALSE)

# First and last 100 coefficients, written to Table-A4.csv
table <- scaled_table(tmod_lss, n = 100, file_name = "Table-A4.csv")

# Term plot with the seed words highlighted and a dotted red line at zero
zero_line <- geom_vline(xintercept = 0, colour = "red", linetype = "dotted")
scaled_textplot <- textplot_scale1d(tmod_lss,
                                    highlighted = unlist(seeds),
                                    highlighted_color = "black") +
  zero_line

ggsave(filename = "Figure-A4.pdf", plot = scaled_textplot, width = 7, height = 5)

# Trubowitz and Watanabe (2021)
#===============================================================================

# Seed words for the Trubowitz and Watanabe (2021) dictionary. With the
# as.seedwords() defaults the first vector is the upper (positive) pole and
# the second the lower (negative) pole of the scale.
seeds <- list(
  c("adversary", "adversaries", "enemy", "enemies", "foe", "foes", "hostile"),
  c("aid", "aids", "ally", "allies", "friend", "friends", "peaceful")
)

# Fit the LSS model on the full dfm
tmod_lss <- tweets.dfm %>%
  textmodel_lss(seeds = as.seedwords(seeds), cache = TRUE, auto_weight = FALSE)

# First and last 100 coefficients, written to Table-A5.csv
table <- scaled_table(tmod_lss, n = 100, file_name = "Table-A5.csv")

# Term plot with the seed words highlighted and a dotted red line at zero
zero_line <- geom_vline(xintercept = 0, colour = "red", linetype = "dotted")
scaled_textplot <- textplot_scale1d(tmod_lss,
                                    highlighted = unlist(seeds),
                                    highlighted_color = "black") +
  zero_line

ggsave(filename = "Figure-A5.pdf", plot = scaled_textplot, width = 7, height = 5)

